#knitr chunk defaults for the whole report:
#show code, but suppress package startup messages and warnings in the output
knitr::opts_chunk$set(
    echo = TRUE,
    message = FALSE,
    warning = FALSE
)

# -- Wrangling, splitting, imputation, and modeling packages ----
library(tidyverse) #used for data wrangling and viz
library(here) #simplifies file paths
library(rsample) #used to split data
library(janitor) #used to clean data names
library(corrplot) #for correlation viz
library(VIM) #for missing data viz
library(missMDA) #for imputing missing data
library(ggfortify) #for PCA viz
library(fastDummies) #for dummy coding
library(caret) #for cross validation
library(class) #for knn
library(gbm) #for boosted trees modeling
library(randomForest) #for random forest modeling
# -- Mapping and presentation packages ----
library(maptools) #NOTE(review): maptools was retired from CRAN in 2023 — confirm availability
library(RColorBrewer) #color palettes for maps
library(classInt) #class intervals for choropleth maps
library(ggplot2) #already attached via tidyverse; kept here for clarity
library(ggrepel) #non-overlapping text labels on plots
library(mapproj) #map projections used by coord_map()
library(viridis) #perceptually uniform color scales
library(pander) #pretty-printed model summary tables
library(knitr) #report rendering utilities
#turns scientific notation off
options(scipen = 100)

#some of our workflow includes randomness. here we set a seed so our workflow can be reproduced without worrying about random variation
set.seed(123) 

1 Introduction

The purpose of this project is to use machine learning techniques to predict which wildfires will grow to a catastrophic size. Our most accurate model will then be used to assess how global climate change may influence a wildfire's predicted size.

1.1 Background

fire situation in North America, climate change, etc

1.2 Why this model is useful

What will it be used for?

1.3 Data and Packages Used

This dataset is available on Kaggle. It is a subset of larger fires

Important attributes include

  • fire_size - the size of the fire in acres

  • stat_cause_descr - the cause of the fire

  • vegetation - code corresponding to vegetation type

  • temp_cont - temperature (Celsius) when the fire was contained

A complete codebook is available in the project file

#read in dataset
us_wildfire <- read_csv(here("archive", "FW_Veg_Rem_Combined.csv"))

#Assign each state to a region, following the National Geographic Education
#regional divisions. Any state not listed below falls through to the default
#"NorthEast" label (NOTE(review): this includes HI — confirm that is intended).
us_wildfire <- us_wildfire %>%
  mutate(region = case_when(
    state %in% c("AK", "WA", "OR", "CA", "NV", "ID", "MT", "WY", "UT", "CO") ~ "West",
    state %in% c("AZ", "NM", "TX", "OK") ~ "SouthWest",
    state %in% c("ND", "SD", "NE", "KS", "MN", "IA", "MO", "WI", "IL", "MI", "IN", "OH") ~ "MidWest",
    state %in% c("AR", "LA", "MS", "AL", "GA", "FL", "SC", "NC", "TN", "KY", "VA", "WV") ~ "SouthEast",
    TRUE ~ "NorthEast" #default for every remaining state
  ))

#years elapsed since the earliest discovery year in the data
us_wildfire <- us_wildfire %>%
  mutate(year_diff = disc_pre_year - min(disc_pre_year))

2 Methods

Our methods involved cleaning the data, conducting exploratory analysis, imputing missing data, and using Principal Components Analysis to reduce the number of features.

2.1 Cleaning Data

To clean the data we first converted all column names to lower snake case using the clean_names() function. Then we selected only the columns we were interested in using, thereby removing all the meaningless columns.

#keep only the columns of interest, then standardize all names
#to lowercase snake_case
us_wildfire_clean <- us_wildfire %>%
  dplyr::select(fire_name:year_diff) %>%
  clean_names()

We are interested in using weather to predict fire size, so we filter out observations where the weather data was not associated with the fire observation.

There are some weather observations for which every weather field is 0. This seems unlikely, especially for temperature and humidity observations. So we will replace them with NAs. Since precipitation is frequently 0, we will not replace all 0 with NAs in the precipitation column.

Instead, we will follow the pattern of NAs seen in the other weather columns. We can see that there is a clear pattern in the missing weather data. When temp_pre_30 is missing, so are wind_pre_30 and humidity_pre_30. We will assume this extends to prec_pre_30 and replace that 0 with a NA.

#filter out observations that do not have a weather file
us_wildfire_clean <- us_wildfire_clean %>% 
  filter(weather_file != "File Not Found")

#All-zero weather records are implausible (especially for temperature and
#humidity), so replace 0 with NA in every weather column except
#precipitation, where 0 is a legitimate value.
#(mutate_at() is superseded; across() is the current dplyr idiom.)
us_wildfire_clean <- us_wildfire_clean %>%
  mutate(across(temp_pre_30:hum_cont, ~na_if(.x, 0)))

#When temperature, wind, and humidity are all missing for a time window,
#we suspect the matching precipitation 0 is really missing too, so we
#convert it to NA. One mutate() handles all four windows.
us_wildfire_clean <- us_wildfire_clean %>% 
  mutate(
    prec_pre_30 = case_when(
      is.na(temp_pre_30) & is.na(hum_pre_30) & is.na(wind_pre_30) ~ NA_real_,
      TRUE ~ prec_pre_30),
    prec_pre_15 = case_when(
      is.na(temp_pre_15) & is.na(hum_pre_15) & is.na(wind_pre_15) ~ NA_real_,
      TRUE ~ prec_pre_15),
    prec_pre_7 = case_when(
      is.na(temp_pre_7) & is.na(hum_pre_7) & is.na(wind_pre_7) ~ NA_real_,
      TRUE ~ prec_pre_7),
    prec_cont = case_when(
      is.na(temp_cont) & is.na(hum_cont) & is.na(wind_cont) ~ NA_real_,
      TRUE ~ prec_cont)
  )

Information about vegetation was stored in a numeric form, where each number represented a class of vegetation. The key for the vegetation codes was in the provided metadata. For ease of interpretation, we replaced the numeric code with a short description of the vegetation type.

According to the metadata for this data set, the vegetation was created by interpolating most likely vegetation based on latitude and longitude. The most common vegetation type is listed as “Polar Desert/Rock/Ice” and this seems very unlikely. Although we keep the vegetation feature in the data, we have doubts about its accuracy.

#Translate the numeric vegetation codes into human-readable labels,
#following the key given in the provided codebook.
#Codes not listed here become NA.
us_wildfire_clean <- us_wildfire_clean %>% 
  mutate(vegetation_classed = case_when(
    vegetation == 4  ~ "Temperate Evergreen Needleleaf Forest TmpENF",
    vegetation == 9  ~ "C3 Grassland/Steppe",
    vegetation == 12 ~ "Open Shrubland",
    vegetation == 14 ~ "Desert",
    vegetation == 15 ~ "Polar Desert/Rock/Ice",
    vegetation == 16 ~ "Secondary Tropical Evergreen Broadleaf Forest"
  ))

Our final data cleaning action was to clean up date columns. Since there are redundant date columns, we chose to keep only month and year as variables. We also reclassified months into seasons, so there would be fewer factor levels or dummy coding involved when it came time to prepare the data for modeling.

#Drop redundant date columns (year is kept via year_diff, month via
#discovery_month for now), then collapse discovery_month into a
#four-level season variable and drop the month itself to reduce the
#number of factor levels for modeling.
us_wildfire_clean <- us_wildfire_clean %>% 
  dplyr::select(
    -disc_clean_date,
    -disc_date_pre,
    -cont_clean_date,
    -disc_pre_month,
    -disc_date_final,
    -cont_date_final,
    -putout_time
  ) %>% 
  mutate(
    season = case_when(
      discovery_month %in% c("Jan", "Feb", "Mar") ~ "Winter",
      discovery_month %in% c("Apr", "May", "Jun") ~ "Spring",
      discovery_month %in% c("Jul", "Aug", "Sep") ~ "Summer",
      discovery_month %in% c("Oct", "Nov", "Dec") ~ "Fall"
    )
  ) %>% 
  select(-discovery_month)

2.2 Exploratory Analysis

Our next step was to explore the data. We created several figures to investigate fire observations, as well as the pattern of missing data and the correlation between variables.

First we investigated fire size through time. This figure shows that the total acreage burned per year has been increasing almost linearly since 1990. This is a clue that Year might be a useful feature in our models.

#total acres burned per discovery year
acres_per_year <- us_wildfire_clean %>% 
  group_by(disc_pre_year) %>% 
  summarise(acres_burned = sum(fire_size))

#finalized graph: acreage burned through time, with point size and
#colour both encoding the yearly total
ggplot(acres_per_year,
       aes(x = disc_pre_year,
           y = acres_burned,
           size = acres_burned,
           color = acres_burned)) +
  geom_point() +
  scale_color_continuous(high = "firebrick", low = "goldenrod1") +
  labs(x = "Year",
       y = "Total Acres Burned",
       title = "Total acres burned per year from 1990 to 2015") +
  theme_minimal() +
  theme(legend.position = "none")

We also looked at the most common causes of fires. We found that most fires were started by debris burning, followed by arson. Yikes! We were also surprised that children were frequently listed as the cause of wildfires.

#count fires by listed starting cause
fire_causes <- us_wildfire_clean %>% 
  group_by(stat_cause_descr) %>% 
  count()

#finalized graph: causes ordered by number of fires.
#FIX: the original passed `tite =` to labs(), a typo that labs()
#silently ignores, so the plot rendered without a title.
ggplot(data = fire_causes, aes(y = reorder(stat_cause_descr, n), x = n)) +
  geom_col(aes(fill = n)) +
  scale_fill_gradient(high = "firebrick", low = "goldenrod1") +
  labs(x = "Number of Fires", 
       y = "Cause",
       title = "Number of fires per listed starting cause") +
  theme_minimal() +
  theme(legend.position = "none")

2.2.1 Map of regions

#order the fire size classes from smallest (A) to largest (G)
us_wildfire_clean$class_fac = factor(us_wildfire_clean$fire_size_class, levels = c("A", "B", "C", "D", "E", "F", "G"))

us <- map_data("world", 'usa')

state <- map_data("state")

#label each state polygon with its National Geographic region
#(states not listed default to NorthEast)
state$region2 = "NorthEast"
state[which(state$region %in% c("alaska", "washington", "oregon", "california", "nevada", "idaho", "montana", "utah", "wyoming", "colorado")), "region2"] = "West"
state[which(state$region %in% c("arizona", "new mexico", "oklahoma", "texas")), "region2"] = "SouthWest"
state[which(state$region %in% c("north dakota", "south dakota", "nebraska", "kansas", "minnesota", "iowa", "missouri", "wisconsin", "illinois", "indiana", "michigan", "ohio")), "region2"] = "MidWest"
state[which(state$region %in% c("arkansas", "louisiana", "mississippi", "alabama", "florida", "georgia", "south carolina", "north carolina", "tennessee", "kentucky", "virginia", "west virginia")), "region2"] = "SouthEast"

#FIX: polygons must be grouped by map_data's `group` column. Grouping by
#state name (`region`) joins the separate polygons of multi-polygon states
#(e.g. michigan's two peninsulas) and draws stray edges across the map.
ggplot(data = state, aes(x = long, y = lat, group = group)) + 
  geom_polygon(aes(fill = region2)) +
  ggtitle("US Region") +
  guides(fill = guide_legend(title = "Region")) +
  coord_map(projection = "sinusoidal", xlim = c(-120, -75), ylim = c(25, 50))

2.2.2 Map of fires

2.2.2.1 The spatial distribution of wildfires

#plot every fire on the state basemap, coloured by its size class
ggplot() +
  geom_polygon(
    data = state,
    aes(x = long, y = lat, group = group),
    color = "white", fill = "grey"
  ) +
  geom_point(
    data = us_wildfire_clean,
    aes(x = longitude, y = latitude, color = class_fac)
  ) +
  scale_color_brewer(palette = "YlOrRd") +
  ggtitle("US Wildfire Distribution") +
  guides(color = guide_legend(title = "Wild Fire Scale")) +
  coord_map(projection = "sinusoidal", xlim = c(-120, -75), ylim = c(25, 50))

2.2.3 When we divide the data into three periods, we can see the wildfire risk has been growing in the Western parts of the US.

#Helper: draw the fire-distribution map for a subset of the fires.
#Factoring out the shared layers removes the triplicated plotting code.
plot_fire_map <- function(fires, map_title) {
  ggplot() +
    geom_polygon(data = state, aes(x = long, y = lat, group = group),
                 color = "white", fill = "grey") +
    geom_point(data = fires,
               aes(x = longitude, y = latitude, color = class_fac)) +
    scale_color_brewer(palette = "YlOrRd") +
    ggtitle(map_title) +
    guides(color = guide_legend(title = "Wild Fire Scale")) +
    coord_map(projection = "sinusoidal", xlim = c(-120, -75), ylim = c(25, 50))
}

#fires recorded before 1970
plot_fire_map(filter(us_wildfire_clean, wstation_byear < 1970),
              "US Wildfire Distribution before 1970")

#fires recorded between 1970 and 2000
plot_fire_map(filter(us_wildfire_clean,
                     wstation_byear >= 1970, wstation_byear < 2000),
              "US Wildfire Distribution 1970-2000")

#FIX: the original filtered on wstation_byear >= 200 (missing a zero),
#which selected nearly every observation instead of only post-2000 fires.
plot_fire_map(filter(us_wildfire_clean, wstation_byear >= 2000),
              "US Wildfire Distribution after 2000")

2.2.4 Density graph

#Density of large-fire (>100 acre) sizes for three eras.
#FIXES vs the original: the first era used <= 1970 while the second used
#>= 1970, so year-1970 fires appeared in both curves and disagreed with
#the period maps above (which use < 1970); the eras are now disjoint.
#The title typo "Severeity" is also corrected, and the redundant
#y = ..density.. mapping is dropped (density is geom_density's default).
fires_large <- us_wildfire_clean[which(us_wildfire_clean$fire_size > 100), ]

ggplot() +
  geom_density(data = fires_large[which(fires_large$wstation_byear < 1970), ],
               aes(x = fire_size),
               alpha = .3, colour = "dodgerblue", fill = "dodgerblue") +
  geom_density(data = fires_large[which(fires_large$wstation_byear >= 1970 & fires_large$wstation_byear < 2000), ],
               aes(x = fire_size),
               alpha = .3, colour = "yellow3", fill = "yellow3") +
  geom_density(data = fires_large[which(fires_large$wstation_byear >= 2000), ],
               aes(x = fire_size),
               alpha = .3, colour = "firebrick3", fill = "firebrick3") +
  xlim(10000, 100000) +
  ggtitle("Wildfire Severity")

2.2.5 Exploring Missing Data and Correlation

As part of our exploratory analysis, we looked at the pattern of missing data. This figure shows missing data in red. You can see that most missing data is concentrated in the weather columns.

#missing data plot (VIM::aggr): left panel shows the proportion missing
#per variable, right panel shows the combinations of missingness;
#per-variable missing proportions are printed below the figure
aggr_plot <- aggr(us_wildfire_clean, 
                  col = c('yellow','red'), #observed = yellow, missing = red
                  numbers = TRUE, #annotate the combination proportions
                  sortVars = TRUE, #order variables by amount missing
                  labels = names(us_wildfire_clean), 
                  cex.axis = .7, #shrink axis text so all labels fit
                  gap = 2, 
                  ylab = c("Histogram of missing data","Pattern"))

## 
##  Variables sorted by number of missings: 
##            Variable      Count
##           fire_name 0.50282019
##            hum_cont 0.42786638
##           wind_cont 0.41558884
##           temp_cont 0.41002139
##           prec_cont 0.41002139
##  vegetation_classed 0.17480307
##           hum_pre_7 0.14329476
##          hum_pre_15 0.12783234
##          wind_pre_7 0.12250802
##          temp_pre_7 0.11173782
##          prec_pre_7 0.11173782
##         wind_pre_15 0.10663231
##         temp_pre_15 0.09571623
##         prec_pre_15 0.09571623
##          hum_pre_30 0.09413595
##         wind_pre_30 0.07308179
##         temp_pre_30 0.06216571
##         prec_pre_30 0.06216571
##           fire_size 0.00000000
##     fire_size_class 0.00000000
##    stat_cause_descr 0.00000000
##            latitude 0.00000000
##           longitude 0.00000000
##               state 0.00000000
##       disc_pre_year 0.00000000
##       wstation_usaf 0.00000000
##          dstation_m 0.00000000
##       wstation_wban 0.00000000
##      wstation_byear 0.00000000
##      wstation_eyear 0.00000000
##          vegetation 0.00000000
##            fire_mag 0.00000000
##        weather_file 0.00000000
##          remoteness 0.00000000
##              region 0.00000000
##           year_diff 0.00000000
##              season 0.00000000
##           class_fac 0.00000000

We hypothesized that weather variables were correlated. To test this out, we created a correlation matrix.

This figure shows that there is a strong correlation within each set of variables. I.e. the 7 day average temperature is correlated with the 30 day average temperature. This is not surprising!

#pull out just the weather columns
weather <- us_wildfire_clean %>%
  dplyr::select(temp_pre_30:prec_cont)

#pairwise correlations, using only rows with no missing weather values
cor_matrix <- cor(weather, use = "complete.obs")

#visualize the upper triangle only, with variables clustered so that
#correlated sets sit next to each other
corrplot(cor_matrix,
         type = "upper",
         order = "hclust",
         tl.col = "black",
         tl.srt = 45)

2.3 Model Preparation

After exploring our data, we were ready to prepare it for modeling. We describe each preparation step below.

2.3.1 Splitting Data

We split our data into 80% training and 20% testing data. Because fire size is heavily skewed towards smaller fires, we used stratified sampling.

There are xyz observations in the training set and xyz observations in the test set.

#first we make a dataframe containing only variables we want to use in our models
fire_modeling_df <- us_wildfire_clean %>%
  dplyr::select(-fire_name, #remove fire name
         -fire_size_class, #remove fire size class
         -class_fac, #which is also a size class
         -state, #because we have regions
         -disc_pre_year, #because we have year_diff to represent year
         -wstation_usaf, #remove weather station name
         -wstation_wban, #remove this (not described in codebook)
         -wstation_byear, #remove station year installed
         -wstation_eyear, #remove station year ended data recording
         -weather_file, #remove name of weather file
         -dstation_m, #remove distance of weather station to fire
         -vegetation #remove vegetaiton because we already have it classed
         )

#define split parameters
us_wildfire_split <- fire_modeling_df %>% 
  initial_split(prop = 0.8, strata = "fire_size")

#write split data to data frames
fire_train <- training(us_wildfire_split)
fire_test <- testing(us_wildfire_split)

#set up folds in training data for cross validation
train_folds <- vfold_cv(fire_train, v = 10, repeats = 5)

2.3.2 Imputing Missing Data

#split train and test into their weather columns and everything else
weather_train <- select(fire_train, temp_pre_30:prec_cont)
weather_test <- select(fire_test, temp_pre_30:prec_cont)

#the non-weather columns are whatever is left over
notweather_train <- select(fire_train, -colnames(weather_train))
notweather_test <- select(fire_test, -colnames(weather_test))

#impute missing weather values with iterative PCA (missMDA),
#retaining 4 components; imputation is done per-partition
weather_train_imputed <- imputePCA(weather_train, ncp = 4)
weather_test_imputed <- imputePCA(weather_test, ncp = 4)

2.3.3 Principal Components

#NOTE(review): PCA is fit separately on the train and test sets below.
#The two rotations can differ (or flip sign), so the test PCs are not
#guaranteed to live in the same space as the train PCs used to fit the
#models — consider projecting the test data with the train rotation
#(predict(weather_train_PCA, newdata = ...)) instead; verify before reuse.

#PCA on the imputed training weather data
weather_train_PCA <- weather_train_imputed$completeObs %>%
  scale(center = T, scale = T) %>%  #first scale and center the data
  prcomp(rank. = 4) #keep only the first 4 principal components


#weather_train_PCA$x

#PCA on the imputed test weather data (fit independently — see note above)
weather_test_PCA <- weather_test_imputed$completeObs %>%
  scale(center = T, scale = T) %>%  #first scale and center the data
  prcomp(rank. = 4)

#Make a PCA bi-plot of the training data with variable loadings overlaid
autoplot(weather_train_PCA, 
         data = weather_train,
         loadings = TRUE, 
         loadings.label = TRUE,
         loadings.label.hjust = 1.2) +
  theme_classic() +
  labs(caption = "Principle Component Analysis Bi-plot of Weather Training Data")

#put the data back together

#bind the 4-component weather representation to the non-weather columns,
#then drop any rows that still contain NAs (e.g. missing vegetation_classed)
fire_train_complete <- cbind(notweather_train, weather_train_PCA$x) %>% 
  na.omit()

fire_test_complete <- cbind(notweather_test, weather_test_PCA$x) %>% 
  na.omit()

2.4 Modeling As a Regression Problem

First of all, we applied multiple linear regression models to the dataset. The performance of the multiple linear regression will show why we need to apply machine learning approaches. We started with all of the variables, including the principal components. Then, less relevant variables were removed to find the optimal multiple linear regression model.

#baseline OLS on the full training set: the four weather PCs plus the
#remaining categorical and numeric predictors
lm_whole <- lm(
  fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness +
    vegetation_classed + stat_cause_descr,
  data = fire_train_complete
)
summary(lm_whole) %>% pander
Table continues below
  Estimate Std. Error t value
(Intercept) 3479 386.2 9.009
PC1 -776.3 38.71 -20.05
PC2 -137.7 38.73 -3.557
PC3 135.7 43.91 3.089
PC4 -599.8 55.41 -10.83
year_diff 60.03 12.21 4.915
remoteness -9364 631 -14.84
vegetation_classedDesert -1116 682.8 -1.634
vegetation_classedOpen Shrubland -1396 309.7 -4.507
vegetation_classedPolar Desert/Rock/Ice -633.6 306.8 -2.065
vegetation_classedSecondary Tropical Evergreen Broadleaf Forest -1693 314.3 -5.388
vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF -277.4 612.2 -0.453
stat_cause_descrCampfire 2090 522.5 4
stat_cause_descrChildren -649 514.8 -1.261
stat_cause_descrDebris Burning -243.9 234.2 -1.042
stat_cause_descrEquipment Use 577.3 334.5 1.726
stat_cause_descrFireworks -1440 1218 -1.183
stat_cause_descrLightning 4978 298 16.7
stat_cause_descrMiscellaneous 267.2 271.2 0.9854
stat_cause_descrMissing/Undefined 516.4 326.9 1.579
stat_cause_descrPowerline 971.8 760.9 1.277
stat_cause_descrRailroad -113.4 584.4 -0.194
stat_cause_descrSmoking -641 547.1 -1.172
stat_cause_descrStructure -635.4 1972 -0.3222
  Pr(>|t|)
(Intercept) 0.0000000000000000002214
PC1 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000008317
PC2 0.0003764
PC3 0.00201
PC4 0.000000000000000000000000002977
year_diff 0.000000894
remoteness 0.000000000000000000000000000000000000000000000000125
vegetation_classedDesert 0.1023
vegetation_classedOpen Shrubland 0.000006594
vegetation_classedPolar Desert/Rock/Ice 0.03892
vegetation_classedSecondary Tropical Evergreen Broadleaf Forest 0.0000000717
vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF 0.6505
stat_cause_descrCampfire 0.00006338
stat_cause_descrChildren 0.2075
stat_cause_descrDebris Burning 0.2976
stat_cause_descrEquipment Use 0.08441
stat_cause_descrFireworks 0.2369
stat_cause_descrLightning 0.00000000000000000000000000000000000000000000000000000000000002494
stat_cause_descrMiscellaneous 0.3245
stat_cause_descrMissing/Undefined 0.1142
stat_cause_descrPowerline 0.2016
stat_cause_descrRailroad 0.8461
stat_cause_descrSmoking 0.2414
stat_cause_descrStructure 0.7473
Fitting linear model: fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness + vegetation_classed + stat_cause_descr
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
27229 12405 0.06151 0.06072
#refit the full-data model without the cause-of-fire factor
lm_whole2 <- update(lm_whole, . ~ . - stat_cause_descr)
summary(lm_whole2) %>% pander
Table continues below
  Estimate Std. Error t value
(Intercept) 5068 345.9 14.65
PC1 -1062 35.77 -29.68
PC2 -5.831 38.38 -0.1519
PC3 44.83 43.91 1.021
PC4 -627 55.42 -11.31
year_diff 64.25 12.06 5.327
remoteness -9112 632.6 -14.4
vegetation_classedDesert -1585 687 -2.308
vegetation_classedOpen Shrubland -2583 304.4 -8.486
vegetation_classedPolar Desert/Rock/Ice -1126 307.5 -3.663
vegetation_classedSecondary Tropical Evergreen Broadleaf Forest -2958 308.2 -9.598
vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF -359.6 616.5 -0.5833
  Pr(>|t|)
(Intercept) 0.000000000000000000000000000000000000000000000002028
PC1 1.609e-190
PC2 0.8792
PC3 0.3073
PC4 0.00000000000000000000000000001319
year_diff 0.0000001008
remoteness 0.0000000000000000000000000000000000000000000000728
vegetation_classedDesert 0.02103
vegetation_classedOpen Shrubland 0.00000000000000002247
vegetation_classedPolar Desert/Rock/Ice 0.0002502
vegetation_classedSecondary Tropical Evergreen Broadleaf Forest 0.0000000000000000000008819
vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF 0.5597
Fitting linear model: fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness + vegetation_classed
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
27229 12502 0.0464 0.04602

When we compare the whole region’s regression results, it shows that the updated model is superior to the others.

#same specification as the full model, fit on the SouthWest region only
lm_sw <- lm(
  fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness +
    vegetation_classed + stat_cause_descr,
  data = filter(fire_train_complete, region == "SouthWest")
)
summary(lm_sw) %>% pander
Table continues below
  Estimate Std. Error t value
(Intercept) 1533 1357 1.129
PC1 -614.9 92.89 -6.62
PC2 -608.1 89.88 -6.766
PC3 16.67 210.8 0.07906
PC4 -677.7 168.9 -4.013
year_diff 111.1 32.31 3.438
remoteness -8766 3743 -2.342
vegetation_classedPolar Desert/Rock/Ice -146.3 621.1 -0.2356
vegetation_classedSecondary Tropical Evergreen Broadleaf Forest -504.7 424.5 -1.189
stat_cause_descrCampfire 7716 1496 5.159
stat_cause_descrChildren -1370 1709 -0.8016
stat_cause_descrDebris Burning -123.4 666.8 -0.1851
stat_cause_descrEquipment Use -609.6 805.4 -0.7568
stat_cause_descrFireworks -1617 3341 -0.484
stat_cause_descrLightning 2201 780.4 2.82
stat_cause_descrMiscellaneous 497.1 686.4 0.7241
stat_cause_descrMissing/Undefined 2822 913 3.091
stat_cause_descrPowerline 897.1 1228 0.7305
stat_cause_descrRailroad -836.6 2252 -0.3714
stat_cause_descrSmoking -821.2 1357 -0.6049
stat_cause_descrStructure 223.4 8070 0.02768
  Pr(>|t|)
(Intercept) 0.2588
PC1 0.00000000003888
PC2 0.00000000001442
PC3 0.937
PC4 0.00006076
year_diff 0.0005896
remoteness 0.01922
vegetation_classedPolar Desert/Rock/Ice 0.8138
vegetation_classedSecondary Tropical Evergreen Broadleaf Forest 0.2345
stat_cause_descrCampfire 0.0000002556
stat_cause_descrChildren 0.4228
stat_cause_descrDebris Burning 0.8532
stat_cause_descrEquipment Use 0.4492
stat_cause_descrFireworks 0.6284
stat_cause_descrLightning 0.004814
stat_cause_descrMiscellaneous 0.469
stat_cause_descrMissing/Undefined 0.002006
stat_cause_descrPowerline 0.4651
stat_cause_descrRailroad 0.7103
stat_cause_descrSmoking 0.5452
stat_cause_descrStructure 0.9779
Fitting linear model: fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness + vegetation_classed + stat_cause_descr
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
6326 13938 0.03939 0.03634
#prune the terms that were not significant in the SouthWest fit
lm_sw2 <- update(lm_sw, . ~ . - stat_cause_descr - PC2 - remoteness)
summary(lm_sw2) %>% pander
Table continues below
  Estimate Std. Error t value
(Intercept) -451.6 570.2 -0.792
PC1 -715.1 82.59 -8.659
PC3 256.2 207 1.237
PC4 -600.3 166 -3.616
year_diff 92.57 31.1 2.977
vegetation_classedPolar Desert/Rock/Ice 1153 584.1 1.974
vegetation_classedSecondary Tropical Evergreen Broadleaf Forest 91.81 410.8 0.2235
  Pr(>|t|)
(Intercept) 0.4284
PC1 0.000000000000000005989
PC3 0.216
PC4 0.0003012
year_diff 0.002926
vegetation_classedPolar Desert/Rock/Ice 0.04838
vegetation_classedSecondary Tropical Evergreen Broadleaf Forest 0.8231
Fitting linear model: fire_size ~ PC1 + PC3 + PC4 + year_diff + vegetation_classed
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
6326 14040 0.0231 0.02217
#same specification as the full model, fit on the SouthEast region only
lm_se <- lm(
  fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness +
    vegetation_classed + stat_cause_descr,
  data = filter(fire_train_complete, region == "SouthEast")
)
summary(lm_se) %>% pander
Table continues below
  Estimate Std. Error t value
(Intercept) -2513 2241 -1.121
PC1 -39.35 29.38 -1.34
PC2 -66.19 21.04 -3.146
PC3 41.17 17.79 2.314
PC4 -19.53 31.4 -0.6219
year_diff 2.39 5.862 0.4078
remoteness 19539 468.8 41.67
vegetation_classedOpen Shrubland -955 2240 -0.4263
vegetation_classedPolar Desert/Rock/Ice -726.8 2242 -0.3242
vegetation_classedSecondary Tropical Evergreen Broadleaf Forest -823.9 2240 -0.3678
stat_cause_descrCampfire 18.91 238 0.07945
stat_cause_descrChildren 635.3 218.4 2.909
stat_cause_descrDebris Burning 227.7 91.93 2.477
stat_cause_descrEquipment Use 426.1 161.3 2.641
stat_cause_descrFireworks 268.1 1041 0.2577
stat_cause_descrLightning 1358 156.8 8.66
stat_cause_descrMiscellaneous 258.4 134.9 1.916
stat_cause_descrMissing/Undefined 242.2 149.5 1.62
stat_cause_descrPowerline 292.9 918.3 0.319
stat_cause_descrRailroad 575.4 219.9 2.617
stat_cause_descrSmoking 343.5 248.8 1.381
stat_cause_descrStructure 168.4 894 0.1884
  Pr(>|t|)
(Intercept) 0.2623
PC1 0.1804
PC2 0.001662
PC3 0.0207
PC4 0.534
year_diff 0.6835
remoteness 0
vegetation_classedOpen Shrubland 0.6699
vegetation_classedPolar Desert/Rock/Ice 0.7458
vegetation_classedSecondary Tropical Evergreen Broadleaf Forest 0.713
stat_cause_descrCampfire 0.9367
stat_cause_descrChildren 0.003634
stat_cause_descrDebris Burning 0.01325
stat_cause_descrEquipment Use 0.008266
stat_cause_descrFireworks 0.7967
stat_cause_descrLightning 0.000000000000000005298
stat_cause_descrMiscellaneous 0.05545
stat_cause_descrMissing/Undefined 0.1052
stat_cause_descrPowerline 0.7498
stat_cause_descrRailroad 0.008878
stat_cause_descrSmoking 0.1674
stat_cause_descrStructure 0.8506
Fitting linear model: fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness + vegetation_classed + stat_cause_descr
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
12420 3878 0.1363 0.1349
#keep only the terms that were significant in the SouthEast fit
lm_se2 <- update(lm_se, . ~ . - stat_cause_descr - PC1 - PC4- year_diff - vegetation_classed)
summary(lm_se2) %>% pander
  Estimate Std. Error t value Pr(>|t|)
(Intercept) -3170 87.4 -36.27 7.704e-274
PC2 -33.67 19.94 -1.689 0.09134
PC3 31.52 16.36 1.927 0.05398
remoteness 19712 458.2 43.02 0
Fitting linear model: fire_size ~ PC2 + PC3 + remoteness
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
12420 3890 0.1298 0.1296
#same specification as the full model, fit on the MidWest region only
lm_mw <- lm(
  fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness +
    vegetation_classed + stat_cause_descr,
  data = filter(fire_train_complete, region == "MidWest")
)
summary(lm_mw) %>% pander
Table continues below
  Estimate Std. Error t value
(Intercept) 4209 548.8 7.668
PC1 -384.9 57.72 -6.67
PC2 -118.3 46.16 -2.562
PC3 74.11 51.91 1.428
PC4 -195.5 76.26 -2.563
year_diff 8.26 13.92 0.5933
remoteness -15258 1846 -8.265
vegetation_classedDesert -328 544.9 -0.6019
vegetation_classedPolar Desert/Rock/Ice 209.4 183 1.144
vegetation_classedSecondary Tropical Evergreen Broadleaf Forest 1161 874.2 1.328
vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF 2727 507.3 5.376
stat_cause_descrCampfire 201.1 507.3 0.3964
stat_cause_descrChildren -237.1 420.6 -0.5637
stat_cause_descrDebris Burning -314.8 260.1 -1.21
stat_cause_descrEquipment Use -249.2 342 -0.7285
stat_cause_descrFireworks -20.58 692 -0.02975
stat_cause_descrLightning 2348 380.9 6.163
stat_cause_descrMiscellaneous -135.2 288.2 -0.4693
stat_cause_descrMissing/Undefined 313.4 412 0.7607
stat_cause_descrPowerline 1460 794.3 1.838
stat_cause_descrRailroad -573.3 599.8 -0.9558
stat_cause_descrSmoking -621.6 600 -1.036
stat_cause_descrStructure -484.6 1374 -0.3526
  Pr(>|t|)
(Intercept) 0.00000000000002511
PC1 0.00000000003168
PC2 0.01046
PC3 0.1535
PC4 0.01043
year_diff 0.553
remoteness 0.0000000000000002281
vegetation_classedDesert 0.5473
vegetation_classedPolar Desert/Rock/Ice 0.2526
vegetation_classedSecondary Tropical Evergreen Broadleaf Forest 0.1845
vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF 0.00000008365
stat_cause_descrCampfire 0.6918
stat_cause_descrChildren 0.573
stat_cause_descrDebris Burning 0.2262
stat_cause_descrEquipment Use 0.4664
stat_cause_descrFireworks 0.9763
stat_cause_descrLightning 0.0000000008322
stat_cause_descrMiscellaneous 0.6389
stat_cause_descrMissing/Undefined 0.4469
stat_cause_descrPowerline 0.06612
stat_cause_descrRailroad 0.3393
stat_cause_descrSmoking 0.3003
stat_cause_descrStructure 0.7244
Fitting linear model: fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness + vegetation_classed + stat_cause_descr
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
2442 4054 0.09542 0.08719
#prune the terms that were not significant in the MidWest fit
lm_mw2 <- update(lm_mw, . ~ . - stat_cause_descr - PC3 - year_diff - PC2)
summary(lm_mw2) %>% pander
Table continues below
  Estimate Std. Error t value
(Intercept) 4329 477.4 9.068
PC1 -503.9 51.22 -9.838
PC4 -149.5 72.39 -2.066
remoteness -13396 1804 -7.427
vegetation_classedDesert -381 545.1 -0.6989
vegetation_classedPolar Desert/Rock/Ice 185.1 180.8 1.024
vegetation_classedSecondary Tropical Evergreen Broadleaf Forest 812.7 869.6 0.9346
vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF 3101 505.9 6.131
  Pr(>|t|)
(Intercept) 0.0000000000000000002438
PC1 0.0000000000000000000002007
PC4 0.03896
remoteness 0.0000000000001528
vegetation_classedDesert 0.4847
vegetation_classedPolar Desert/Rock/Ice 0.3061
vegetation_classedSecondary Tropical Evergreen Broadleaf Forest 0.3501
vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF 0.000000001018
Fitting linear model: fire_size ~ PC1 + PC4 + remoteness + vegetation_classed
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
2442 4101 0.06889 0.06621
#same specification as the full model, fit on the West region only
lm_w <- lm(
  fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness +
    vegetation_classed + stat_cause_descr,
  data = filter(fire_train_complete, region == "West")
)
summary(lm_w) %>% pander
Table continues below
  Estimate Std. Error t value
(Intercept) 18251 2013 9.066
PC1 -430.3 205.8 -2.091
PC2 618 234 2.641
PC3 394 567.2 0.6947
PC4 448.2 386.2 1.161
year_diff 191 54.42 3.509
remoteness -44907 2174 -20.65
vegetation_classedDesert -1220 1602 -0.7619
vegetation_classedOpen Shrubland 2287 2677 0.8545
vegetation_classedPolar Desert/Rock/Ice -206.9 904.2 -0.2288
vegetation_classedSecondary Tropical Evergreen Broadleaf Forest 664.4 1380 0.4813
vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF -1070 1706 -0.6272
stat_cause_descrCampfire 2662 2613 1.019
stat_cause_descrChildren -226.2 2936 -0.07705
stat_cause_descrDebris Burning 141.6 2011 0.0704
stat_cause_descrEquipment Use 1368 1877 0.7291
stat_cause_descrFireworks -3313 4275 -0.7749
stat_cause_descrLightning 1941 1635 1.187
stat_cause_descrMiscellaneous -618 1771 -0.3489
stat_cause_descrMissing/Undefined -2598 1975 -1.316
stat_cause_descrPowerline -970.7 3398 -0.2857
stat_cause_descrRailroad -1649 3868 -0.4264
stat_cause_descrSmoking -125.5 3331 -0.03767
stat_cause_descrStructure -3756 8487 -0.4425
  Pr(>|t|)
(Intercept) 0.0000000000000000001833
PC1 0.03657
PC2 0.008305
PC3 0.4873
PC4 0.2459
year_diff 0.0004545
remoteness 0.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001829
vegetation_classedDesert 0.4462
vegetation_classedOpen Shrubland 0.3929
vegetation_classedPolar Desert/Rock/Ice 0.819
vegetation_classedSecondary Tropical Evergreen Broadleaf Forest 0.6303
vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF 0.5306
stat_cause_descrCampfire 0.3083
stat_cause_descrChildren 0.9386
stat_cause_descrDebris Burning 0.9439
stat_cause_descrEquipment Use 0.466
stat_cause_descrFireworks 0.4385
stat_cause_descrLightning 0.2353
stat_cause_descrMiscellaneous 0.7271
stat_cause_descrMissing/Undefined 0.1884
stat_cause_descrPowerline 0.7752
stat_cause_descrRailroad 0.6699
stat_cause_descrSmoking 0.9699
stat_cause_descrStructure 0.6581
Fitting linear model: fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness + vegetation_classed + stat_cause_descr
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
4352 23592 0.1273 0.1227
lm_w2 <- update(lm_w, . ~ . - stat_cause_descr - PC3 - PC4 - vegetation_classed)
summary(lm_w2) %>% pander
Table continues below
  Estimate Std. Error t value
(Intercept) 19277 1158 16.65
PC1 -364.1 158.9 -2.291
PC2 637.5 200.1 3.186
year_diff 186.3 53.71 3.468
remoteness -46229 2047 -22.58
  Pr(>|t|)
(Intercept) 0.000000000000000000000000000000000000000000000000000000000002299
PC1 0.022
PC2 0.001452
year_diff 0.0005291
remoteness 7.264e-107
Fitting linear model: fire_size ~ PC1 + PC2 + year_diff + remoteness
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
4352 23593 0.1234 0.1226
# Fit the full candidate model on the NorthEast-region training rows only.
ne_rows <- which(fire_train_complete$region == "NorthEast")
lm_ne = lm(
  fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness +
    vegetation_classed + stat_cause_descr,
  data = fire_train_complete[ne_rows, ]
)
pander(summary(lm_ne))
Table continues below
  Estimate Std. Error t value
(Intercept) -46.72 47.27 -0.9884
PC1 -11.38 9.196 -1.237
PC2 -9.431 6.176 -1.527
PC3 -6.782 5.752 -1.179
PC4 10.73 9.761 1.099
year_diff 0.5557 2.002 0.2775
remoteness 1132 69.54 16.28
vegetation_classedDesert -24.33 67.02 -0.3631
vegetation_classedOpen Shrubland -87.43 80.46 -1.087
vegetation_classedPolar Desert/Rock/Ice 26.36 26.21 1.006
vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF 5.826 35.97 0.162
stat_cause_descrCampfire -27.26 64.16 -0.425
stat_cause_descrChildren -28.82 82.97 -0.3474
stat_cause_descrDebris Burning -35.04 39.67 -0.8833
stat_cause_descrEquipment Use -30.7 67.41 -0.4555
stat_cause_descrFireworks -3.143 411.4 -0.007641
stat_cause_descrLightning -29.71 62.79 -0.4732
stat_cause_descrMiscellaneous -38.33 33.96 -1.129
stat_cause_descrMissing/Undefined 34.69 65.41 0.5304
stat_cause_descrPowerline -40.74 122.2 -0.3334
stat_cause_descrRailroad -21.1 121.9 -0.1731
stat_cause_descrSmoking -39.8 56.33 -0.7065
stat_cause_descrStructure -29.27 411.3 -0.07117
  Pr(>|t|)
(Intercept) 0.3231
PC1 0.2163
PC2 0.1269
PC3 0.2385
PC4 0.272
year_diff 0.7814
remoteness 0.0000000000000000000000000000000000000000000000000000002025
vegetation_classedDesert 0.7166
vegetation_classedOpen Shrubland 0.2774
vegetation_classedPolar Desert/Rock/Ice 0.3146
vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF 0.8713
stat_cause_descrCampfire 0.6709
stat_cause_descrChildren 0.7284
stat_cause_descrDebris Burning 0.3772
stat_cause_descrEquipment Use 0.6488
stat_cause_descrFireworks 0.9939
stat_cause_descrLightning 0.6361
stat_cause_descrMiscellaneous 0.2592
stat_cause_descrMissing/Undefined 0.5959
stat_cause_descrPowerline 0.7389
stat_cause_descrRailroad 0.8626
stat_cause_descrSmoking 0.48
stat_cause_descrStructure 0.9433
Fitting linear model: fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness + vegetation_classed + stat_cause_descr
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
1689 409.1 0.1589 0.1478
# Simplify the NorthEast model down to PC1 and remoteness, the only
# predictors that approached significance in the full fit.
lm_ne2 <- update(
  lm_ne,
  . ~ . - stat_cause_descr - vegetation_classed - year_diff - PC4 - PC3 - PC2
)
pander(summary(lm_ne2))
Table continues below
  Estimate Std. Error t value
(Intercept) -42.51 13.64 -3.116
PC1 -11.98 6.932 -1.728
remoteness 1100 64.7 17
  Pr(>|t|)
(Intercept) 0.001861
PC1 0.0841
remoteness 0.000000000000000000000000000000000000000000000000000000000006434
Fitting linear model: fire_size ~ PC1 + remoteness
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
1689 407.8 0.1545 0.1535

The whole-US model’s test RMSE is 10,325.41; we use this value as the baseline against which each regional model’s RMSE is compared.

#helper: predict fire size for one region's held-out test rows with a
#fitted regional model
region_test_pred <- function(model, region_name) {
  rows <- which(fire_test_complete$region == region_name)
  predict(model, newdata = fire_test_complete[rows, ])
}

#helper: test RMSE for one region, given its prediction vector
region_test_rmse <- function(pred, region_name) {
  actual <- fire_test_complete$fire_size[which(fire_test_complete$region == region_name)]
  sqrt(mean((actual - pred)^2))
}

#the whole-US model is evaluated on the entire test set
whole_pred = predict(lm_whole2, newdata = fire_test_complete)
whole_test_RMSE = sqrt(mean((fire_test_complete$fire_size - whole_pred)^2))

#each regional model is evaluated only on its own region's test rows
sw_pred = region_test_pred(lm_sw2, "SouthWest")
sw_test_RMSE = region_test_rmse(sw_pred, "SouthWest")

se_pred = region_test_pred(lm_se2, "SouthEast")
se_test_RMSE = region_test_rmse(se_pred, "SouthEast")

mw_pred = region_test_pred(lm_mw2, "MidWest")
mw_test_RMSE = region_test_rmse(mw_pred, "MidWest")

w_pred = region_test_pred(lm_w2, "West")
w_test_RMSE = region_test_rmse(w_pred, "West")

#BUG FIX: the NorthEast predictions previously used the West model (lm_w2),
#so the reported NorthEast RMSE was not for the NorthEast model at all.
#Use the NorthEast model lm_ne2.
ne_pred = region_test_pred(lm_ne2, "NorthEast")
ne_test_RMSE = region_test_rmse(ne_pred, "NorthEast")

#collect RMSEs and test-set sizes into a summary table for comparison
table_lm = data.frame(Region = c("Whole US", "North East", "Mid West", "West", "South West", "South East"),
                      RMSE = c(whole_test_RMSE, ne_test_RMSE, mw_test_RMSE, w_test_RMSE, sw_test_RMSE, se_test_RMSE),
                      NumberofObservation = c(length(whole_pred),length(ne_pred),length(mw_pred),length(w_pred),length(sw_pred),length(se_pred)))
kable(table_lm, caption = "Linear Model's RMSE")
Linear Model’s RMSE
Region RMSE NumberofObservation
Whole US 10325.410 6713
North East 20619.775 435
Mid West 5623.652 630
West 15608.521 1058
South West 11636.710 1581
South East 3281.264 3009

2.4.1 K Nearest Neighbor

KNN (short for K Nearest Neighbor) is a non-linear algorithm that works for both classification and regression problems. The basic premise behind the model is that it predicts the dependent variable for a new observation based on how similar that observation is to other observations where the dependent variable is known.

KNN works by calculating Euclidean distance between observations, so all inputs have to be numeric. Therefore, we have to do some pre-model data changes to our training and test sets. For both the training data and the test data we dummy code categorical variables, then we center and scale the data. As before, we assume that the training and test data are completely separated, so we process the two data sets separately.

#KNN needs numeric inputs, so the categorical predictors are one-hot encoded
cat_cols <- c("stat_cause_descr", "region", "vegetation_classed", "season")

knn_ready_train <- fire_train_complete %>%
  dummy_cols(select_columns = cat_cols) %>%
  #drop the original factor columns now that the dummy columns exist
  select(-stat_cause_descr, -region, -vegetation_classed, -season) %>%
  #ensure we end up with a plain data frame
  as.data.frame()

#standardize every predictor; column 1 is the outcome (fire_size) and is
#left untouched
knn_ready_train[,-1] <- scale(knn_ready_train[,-1], center = TRUE, scale = TRUE)

#apply the identical preprocessing to the test set
#NOTE(review): the test set is centered/scaled with its own mean and sd by
#design — the text above states train and test are treated as fully separate
knn_ready_test <- fire_test_complete %>%
  dummy_cols(select_columns = cat_cols) %>%
  select(-stat_cause_descr, -region, -vegetation_classed, -season) %>%
  as.data.frame()

knn_ready_test[,-1] <- scale(knn_ready_test[,-1], center = TRUE, scale = TRUE)

Next we split up the training and test data into a data frame that has only independent variables and a data frame that has only dependent variables (in this case fire_size). We do this for both the test and the training data.

#separate the outcome from the predictors, for both train and test sets

#training outcome: the true fire sizes
YTrain = knn_ready_train$fire_size

#training design matrix: every column except the outcome
XTrain = select(knn_ready_train, -fire_size)

#test outcome: the true fire sizes on the held-out data
YTest = knn_ready_test$fire_size

#test design matrix
XTest = select(knn_ready_test, -fire_size)

Then we use Leave One Out Cross Validation to determine the best number of neighbors to consider. A low number of neighbors considered results in a highly flexible model, while a higher number results in a less flexible model. For this process we built a function into which we enter a starting value of K, an ending value of K, and the sampling interval. The result is a data frame of MSE values for each value of K that we test. Because this process is computationally intensive, we automatically save the results to a csv.

#run leave-one-out cross-validation for KNN over a grid of k values and
#cache the resulting MSEs to a csv (the CV is slow, so we save results).
#
#  startk/endk: first and last number of neighbors to try
#  interval:    step between successive k values
knn_loocv <- function(startk, endk, interval)
  {

#candidate numbers of nearest neighbors to be considered
allK = seq(from = startk, to = endk, by = interval)

#preallocate one MSE slot per candidate k
k_mse = rep(NA, length(allK))

#for each candidate k, use LOOCV to find a validation error
for (j in seq_along(allK)){
  #predict each left-out observation from its k nearest neighbors
  pred.Yval = knn.cv(train = XTrain, cl = YTrain, k = allK[j])
  #BUG FIX: knn.cv returns a factor, and as.numeric() on a factor yields the
  #internal level codes, not the fire sizes — convert via character first.
  #BUG FIX: index by position j rather than by the k value itself; previously
  #k_mse[i] silently grew the vector past its preallocated length whenever
  #interval > 1 and relied on a later NA filter to recover.
  k_mse[j] = mean((as.numeric(as.character(pred.Yval)) - YTrain)^2)
}

#bind each k value with its MSE (same column names as the cached csvs)
knn_loocv_results <- data.frame(allK = allK, k_mse = k_mse)

#save MSE as CSV (because the cross validation process takes a long time)
write_csv(knn_loocv_results, paste0("model_results/knn/knn_mse_k", startk,"_k", endk, "_by", interval, ".csv"))

}

#we tried several different sets of k
knn_loocv(startk = 1, endk = 20, interval = 1)
knn_loocv(startk = 1, endk = 500, interval = 20)

Next, we go through our results for all the values of K that we tested. We find that the MSE increases as K increases.

#load the cached LOOCV results for both grids of k values
knn_mse_k1_k500_by20 <- read_csv(here("model_results", "knn", "knn_mse_k1_k500_by20.csv"))
knn_mse_k1_k20_by1 <- read_csv(here("model_results", "knn", "knn_mse_k1_k20_by1.csv"))

#MSE across the coarse grid of k values
plot(knn_mse_k1_k500_by20$allK, knn_mse_k1_k500_by20$k_mse,
     type = "l", xlab = "k", ylab = "MSE",
     main = "MSE for K 1 through 500 at intervals of 20")

When we look at K 1-20 we get the lowest MSE at around 3 before it starts increasing.

#MSE across the fine grid of k values (minimum appears near k = 3)
plot(knn_mse_k1_k20_by1$allK, knn_mse_k1_k20_by1$k_mse,
     type = "l", xlab = "k", ylab = "MSE",
     main = "MSE for K 1 through 20 at intervals of 1")

Just to confirm, we will print out the best number of neighbors.

#best number of neighbors: the k achieving the minimum LOOCV MSE
#(max() breaks ties in favor of the larger, less flexible k)
best_mse <- min(knn_mse_k1_k20_by1$k_mse)
numneighbor = max(knn_mse_k1_k20_by1$allK[knn_mse_k1_k20_by1$k_mse == best_mse])

#print best number of neighbors
numneighbor
## [1] 2

Now that we’ve found the best number of nearest neighbors to consider, we try out KNN on our test data! Below is the test MSE and the test RMSE.

#fit KNN with the chosen number of neighbors and predict fire size on the
#test set
pred.YTest = knn(train = XTrain, test = XTest, cl = YTrain, k = numneighbor)

#BUG FIX: knn() returns a factor; as.numeric() on a factor gives the level
#codes, not the underlying fire sizes. Convert through character to recover
#the actual predicted values before computing the MSE.
test_MSE <- mean((as.numeric(as.character(pred.YTest)) - YTest)^2)

#print test MSE
test_MSE
## [1] 83360662
#print test RMSE (square root of the test MSE, in the units of fire_size)
sqrt(test_MSE)
## [1] 9130.206

Overall, this model performs okay.

2.4.2 Boosted Tree

#train boosted tree model; gbm infers squared-error ("gaussian") loss from
#the numeric response
fire_size_boost = gbm(fire_size~., 
                      data = fire_train,
                      n.trees = 500, 
                      interaction.depth = 4
                    )

#the model summary creates a pretty visualization of relative influence
summary(fire_size_boost)

#calculate training error

#BUG FIX: predict.gbm takes `newdata`, not `data` (an unmatched `data =`
#falls into `...` and is ignored), and n.trees should be given explicitly.
predictions_train_boost <- predict(fire_size_boost, newdata = fire_train, n.trees = 500)

#calculate rmse for training data
RMSE(predictions_train_boost, fire_train$fire_size)

#calculate test error

#predict values using held-out test data
predictions_test_boost <- predict(fire_size_boost, newdata = fire_test, n.trees = 500)

#calculate rmse for test data
RMSE(predictions_test_boost, fire_test$fire_size)

2.4.3 Random Forest

#train random forest model
#NOTE: regression is inferred automatically from the numeric response; the
#original `mode = "regression"` is not a randomForest argument and was
#silently ignored, so it is dropped here.
fire_size_rf = randomForest(fire_size ~ ., #writing the formula
                          data = fire_train_complete, #specifying training data to be used
                          mtry = 9, #setting number of variables to randomly sample per each split
                          ntree = 500, #setting number of trees
                          na.action = na.omit, #specifying what to do with NAs
                          importance = TRUE #specifying importance of variables should be assessed
                          )
#plot error against the number of trees
plot(fire_size_rf)

#plot variable importance (top 5 variables)
varImpPlot(fire_size_rf, 
           sort = TRUE, 
           main = "Variable Importance for fire size random forest model", 
           n.var = 5)

#calculate training error

#BUG FIX: predict.randomForest takes `newdata`, not `data`. With `data =`,
#both calls below silently ignored the argument and returned predictions on
#the training fit, so the "test" RMSE was never computed on the test data.
#NOTE(review): the model is fit on fire_train_complete but scored on
#fire_train/fire_test — confirm these data sets are the intended ones.
predictions_train_rf <- predict(fire_size_rf, newdata = fire_train)

#calculate rmse for training data
RMSE(predictions_train_rf, fire_train$fire_size)

#calculate test error

#predict values using held-out test data
predictions_test_rf <- predict(fire_size_rf, newdata = fire_test)

#calculate rmse for test data
RMSE(predictions_test_rf, fire_test$fire_size)

what type of modeling will we do? we must try 4 types

  • neural network

3 Results

predictions without climate change

How do predictions change with 2.5C increase in temp?

4 Conclusion